source('../settings/settings.R')
source('commonFunctions.R')
set.seed(43)
drive1 <- read.csv('../data/processed/analysis/TT1_Drive_1_PP.csv')
drive2 <- read.csv('../data/processed/Analysis/TT1_Drive_2_PP.csv')
drive3 <- read.csv('../data/processed/Analysis/TT1_Drive_3_PP.csv')
drive4 <- read.csv('../data/processed/Analysis/TT1_Drive_4_PP.csv', stringsAsFactors = T)
dfSeg <- data.frame(rep(1, nrow(drive4)), rep(2, nrow(drive4)), rep(3, nrow(drive4)), rep(4, nrow(drive4)))
names(dfSeg) <- c("Seg1", "Seg2", "Seg3", "Seg4")

combinedDf_Seg1 <- cbind(drive4, 
                    drive1$MeanPP_Seg0, 
                    drive2$MeanPP_Seg1, drive3$MeanPP_Seg1, 
                    drive2$MeanPP_Seg0_1, drive3$MeanPP_Seg0_1,
                    drive2$StdPP_Seg1, drive3$StdPP_Seg1,
                    drive2$StdPP_Seg0_1, drive3$StdPP_Seg0_1,
                    dfSeg$Seg1
                  )
combinedDf_Seg2 <- cbind(drive4, 
                    drive1$MeanPP_Seg0, 
                    drive2$MeanPP_Seg2, drive3$MeanPP_Seg2, 
                    drive2$MeanPP_Seg0_2, drive3$MeanPP_Seg0_2,
                    drive2$StdPP_Seg2, drive3$StdPP_Seg2,
                    drive2$StdPP_Seg0_2, drive3$StdPP_Seg0_2,
                    dfSeg$Seg2
                  )
combinedDf_Seg3 <- cbind(drive4, 
                    drive1$MeanPP_Seg0, 
                    drive2$MeanPP_Seg3, drive3$MeanPP_Seg3, 
                    drive2$MeanPP_Seg0_3, drive3$MeanPP_Seg0_3,
                    drive2$StdPP_Seg3, drive3$StdPP_Seg3,
                    drive2$StdPP_Seg0_3, drive3$StdPP_Seg0_3,
                    dfSeg$Seg3
                  )
combinedDf_Seg4 <- cbind(drive4, 
                    drive1$MeanPP_Seg0, 
                    drive2$MeanPP_Seg4, drive3$MeanPP_Seg4, 
                    drive2$MeanPP_Seg0_4, drive3$MeanPP_Seg0_4,
                    drive2$StdPP_Seg4, drive3$StdPP_Seg4,
                    drive2$StdPP_Seg0_4, drive3$StdPP_Seg0_4,
                    dfSeg$Seg4
                  )

names(combinedDf_Seg1) <- c(names(drive4), 
                       "PP_Dev_1_Turning",
                       "PP_Dev_2_Straight", "PP_Dev_3_Straight", 
                       "PP_Dev_2_Turning", "PP_Dev_3_Turning", 
                       "Std_PP_2_Straight", "Std_PP_3_Straight", 
                       "Std_PP_2_Turning", "Std_PP_3_Turning", 
                       "Segment")
names(combinedDf_Seg2) <- c(names(drive4), 
                       "PP_Dev_1_Turning",
                       "PP_Dev_2_Straight", "PP_Dev_3_Straight", 
                       "PP_Dev_2_Turning", "PP_Dev_3_Turning", 
                       "Std_PP_2_Straight", "Std_PP_3_Straight", 
                       "Std_PP_2_Turning", "Std_PP_3_Turning", 
                       "Segment")
names(combinedDf_Seg3) <- c(names(drive4), 
                       "PP_Dev_1_Turning",
                       "PP_Dev_2_Straight", "PP_Dev_3_Straight", 
                       "PP_Dev_2_Turning", "PP_Dev_3_Turning", 
                       "Std_PP_2_Straight", "Std_PP_3_Straight", 
                       "Std_PP_2_Turning", "Std_PP_3_Turning", 
                       "Segment")
names(combinedDf_Seg4) <- c(names(drive4), 
                       "PP_Dev_1_Turning",
                       "PP_Dev_2_Straight", "PP_Dev_3_Straight", 
                       "PP_Dev_2_Turning", "PP_Dev_3_Turning", 
                       "Std_PP_2_Straight", "Std_PP_3_Straight", 
                       "Std_PP_2_Turning", "Std_PP_3_Turning", 
                       "Segment")

# combinedDf_Seg1$Subject <- paste0(as.factor(combinedDf_Seg1$Subject), ".S1")
# combinedDf_Seg2$Subject <- paste0(as.factor(combinedDf_Seg2$Subject), ".S2")
# combinedDf_Seg3$Subject <- paste0(as.factor(combinedDf_Seg3$Subject), ".S3")
# combinedDf_Seg4$Subject <- paste0(as.factor(combinedDf_Seg4$Subject), ".S4")

combinedDf <- rbind(combinedDf_Seg1, combinedDf_Seg2, combinedDf_Seg3, combinedDf_Seg4)

combinedDf$Subject <- paste0("#", str_pad(combinedDf$Subject, 2, pad="0"))
combinedDf$Segment <- as.factor(combinedDf$Segment)
combinedDf_NoStressor <- combinedDf[combinedDf$Activity == "NO",]
combinedDf_Cognitive <- combinedDf[combinedDf$Activity == "C",]
combinedDf_Motoric <- combinedDf[combinedDf$Activity == "M",]

# combinedDf_NoStressor$Subject <- as.factor(combinedDf_NoStressor$Subject)
# combinedDf_Cognitive$Subject <- as.factor(combinedDf_Cognitive$Subject)
# combinedDf_Motoric$Subject <- as.factor(combinedDf_Motoric$Subject)
COLOR_NORMAL <- list(color='rgb(120,120,120)')
COLOR_COGNITIVE <- list(color='rgb(158,202,225)')
COLOR_MOTORIC <- list(color='rgb(58,200,225)')
COLOR_FAILURE <- list(color='red')

THRESHOLD_MILD = 0.07
THRESHOLD_EXTREME = 0.2

MARKER_LINE_MILD = list(color="blue")
MARKER_LINE_EXTREME = list(color="red")
library(nlme)
combinedDf$Subject = as.factor(combinedDf$Subject)
combinedDf$Activity = as.factor(combinedDf$Activity)
combinedDf$PP_Dev_Group = ifelse(combinedDf$PP_Dev > THRESHOLD_MILD, 1, 0)
model = lm(PP_Dev ~ 
              abs(PP_Dev_2_Straight) + 
              abs(PP_Dev_3_Straight) +
              abs(PP_Dev_2_Turning) + 
              abs(PP_Dev_3_Turning) + 
              Std_PP_2_Straight + 
              Std_PP_3_Straight + 
              Std_PP_2_Turning +
              Std_PP_3_Turning +
              factor(Activity),
            data=combinedDf,
            random=~1|as.factor(Subject),
            method="REML")
method = 'REML' is not supported. Using 'qr'In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
 extra argument ‘random’ will be disregarded
# anova(model)
summary(model)

Call:
lm(formula = PP_Dev ~ abs(PP_Dev_2_Straight) + abs(PP_Dev_3_Straight) + 
    abs(PP_Dev_2_Turning) + abs(PP_Dev_3_Turning) + Std_PP_2_Straight + 
    Std_PP_3_Straight + Std_PP_2_Turning + Std_PP_3_Turning + 
    factor(Activity), data = combinedDf, method = "REML", random = ~1 | 
    as.factor(Subject))

Residuals:
      Min        1Q    Median        3Q       Max 
-0.113697 -0.050326  0.003557  0.042924  0.143706 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)             0.064925   0.027247   2.383 0.019785 *  
abs(PP_Dev_2_Straight) -0.002686   0.076487  -0.035 0.972081    
abs(PP_Dev_3_Straight) -0.211257   0.143242  -1.475 0.144561    
abs(PP_Dev_2_Turning)   0.242065   0.068999   3.508 0.000776 ***
abs(PP_Dev_3_Turning)   0.080835   0.110406   0.732 0.466411    
Std_PP_2_Straight       0.397988   0.169945   2.342 0.021919 *  
Std_PP_3_Straight       0.518084   0.229902   2.253 0.027231 *  
Std_PP_2_Turning       -0.502623   0.322749  -1.557 0.123720    
Std_PP_3_Turning       -0.773228   0.289183  -2.674 0.009247 ** 
factor(Activity)M       0.090896   0.018619   4.882 6.01e-06 ***
factor(Activity)NO     -0.048262   0.019559  -2.468 0.015948 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.06229 on 73 degrees of freedom
Multiple R-squared:  0.5391,    Adjusted R-squared:  0.4759 
F-statistic: 8.537 on 10 and 73 DF,  p-value: 4.712e-09
plot(model)

Machine Learning

combinedDf$PP_Dev <- NULL

combinedDf$Subject <- NULL
combinedDf$Activity_NO <- ifelse(combinedDf$Activity == "NO", 1, 0)
combinedDf$Activity_C <- ifelse(combinedDf$Activity == "C", 1, 0)
combinedDf$Activity_M <- ifelse(combinedDf$Activity == "M", 1, 0)
combinedDf$Activity <- NULL

combinedDf$PP_Dev_1_Turning <-NULL
combinedDf$PP_Prior <-NULL

combinedDf$InSegment1 <- ifelse(combinedDf$Segment == 1, 1, 0)
combinedDf$InSegment2 <- ifelse(combinedDf$Segment == 2, 1, 0)
combinedDf$InSegment3 <- ifelse(combinedDf$Segment == 3, 1, 0)
combinedDf$InSegment4 <- ifelse(combinedDf$Segment == 4, 1, 0)
combinedDf$Segment <- NULL

combinedDf$Class <- ifelse(combinedDf$PP_Dev_Group == 1, T, F)
combinedDf$PP_Dev_Group <- NULL
# library(mefa)
# combinedDf <- rep(combinedDf, 10) 
n_folds <- 5
params <- param <- list(objective       = "binary:logistic", 
               booster          = "gbtree",
               eval_metric      = "auc",
               eta              = 0.1,
               max_depth        = 15,
               alpha            = 1,
               lambda           = 0,
               gamma            = 0.45,
               min_child_weight = 0.3,
               subsample        = 1,
               colsample_bytree = 1)
           
# XGBoost Model         
xgb_m <- xgb.cv(   params               = param,
                  data = as.matrix(combinedDf %>% select(-Class)) ,
                  label =  combinedDf$Class,
                  nrounds             = 100,
                  verbose             = F,
                  prediction          = T,
                  maximize            = F, # Change this value to F will help to run with more itineration
                  nfold               = n_folds,
                  metrics             = c("auc", "error"),
                  early_stopping_rounds = 50,
                  stratified            = T,
                  scale_pos_weight      = 1/2)

# xgb_m$evaluation_log[xgb_m$best_iteration,"test_auc_mean"]
xgb_m$evaluation_log[xgb_m$best_iteration,]
NA
# Prediction
combinedDf$clsPred <- round(xgb_m$pred)

computePerformanceResults <- function(sdat){
  sdat = sdat[complete.cases(sdat),]
  acc = sum(sdat[,1] == sdat[,2])/nrow(sdat)
  conf_mat = table(sdat)
  specif = conf_mat[1,1]/sum(conf_mat[,1])
  sensiv = conf_mat[2,2]/sum(conf_mat[,2])
  preci =  conf_mat[2,2]/sum(conf_mat[2,])
  npv =    conf_mat[1,1]/sum(conf_mat[1,])
  return(c(acc,specif,sensiv,preci,npv))
}

# Get average performance
performance <- computePerformanceResults(combinedDf %>% select(Class, clsPred))
acc <- performance[1]
prec <- performance[4]
recall <- performance[3]
spec <- performance[2]
npv <- performance[5]
f1 <- (2 * recall * prec) / (recall + prec)
auc <- as.numeric(xgb_m$evaluation_log[xgb_m$best_iteration, "test_auc_mean"])

print(paste("Accuracy=", round(acc, 2)))
[1] "Accuracy= 0.81"
print(paste("Precision=", round(prec, 2)))
[1] "Precision= 0.79"
print(paste("Recall=", round(recall, 2)))
[1] "Recall= 0.89"
print(paste("Specificity=", round(spec, 2)))
[1] "Specificity= 0.71"
print(paste("NPV=", round(npv, 2)))
[1] "NPV= 0.84"
print(paste("F1=", round(f1, 2)))
[1] "F1= 0.84"
print(paste("AUC=", round(auc, 2)))
[1] "AUC= 0.91"
# Importance
combinedDf$clsPred <- NULL
bst <- xgboost(   params               = param,
                  data = as.matrix(combinedDf %>% select(-Class)) ,
                  label =  combinedDf$Class,
                  nrounds             = 100,
                  verbose             = F,
                  prediction          = T,
                  maximize            = F, # Change this value to F will help to run with more itineration
                  nfold               = n_folds,
                  metrics             = c("auc", "error"),
                  early_stopping_rounds = 50,
                  stratified            = T,
                  scale_pos_weight      = 1/2)
importanceDf <- xgb.importance(colnames(combinedDf %>% select(-Class) ), model = bst)
print(importanceDf)
library(pROC)

dfROC <- pROC::roc(response = ifelse(combinedDf$Class==T, 1, 0),
               predictor = round(xgb_m$pred),
               levels=c(0, 1), direction = "<")

# it = which.max(xgb_m$evaluation_log$test_auc_mean)
# best.iter = xgb_m$evaluation_log$iter[it]
# best.iter 

plot(pROC::roc(response = ifelse(combinedDf$Class==T, 1, 0),
               predictor = round(xgb_m$pred),
               levels=c(0, 1), direction = "<"), 
     legacy.axes = TRUE,
     main="ROC Curve", 
     lwd=1.5) 

Important features

# Eleminate #5 who has an exceptional data to find a better threshold
stdPP3 <- sort(importantFeaturesDf$Std_PP_3, decreasing = T)[2:length(importantFeaturesDf$Std_PP_3)]
stdPP3Array <- matrix(stdPP3 ,nrow = 1,ncol = length(stdPP3))
  
maxStdPP3 <- sort(importantFeaturesDf$Std_PP_3, decreasing = T)[2]
PP_DEV_3_THRESHOLD <- otsu(stdPP3Array, range=c(min(stdPP3), maxStdPP3)) # Expected Threshold = 0.088
print(paste0('Threshold: ', PP_DEV_3_THRESHOLD))
[1] "Threshold: 0.0881111526518663"
importantFeaturesDf$PP_Dev_3_Group <- ifelse(importantFeaturesDf$Std_PP_3 > PP_DEV_3_THRESHOLD, 1, 0)
write.csv(importantFeaturesDf, "../outputs/importantFeatures.csv")

Plot feature importance

yAxis <- list(
  title = 'Importance',
  range=c(0.0, 1.0)
)
xAxis <- list(
  title = ''
)
importanceDf$FeatureName <- lapply(importanceDf$Feature, function(x) {
  ifelse(x=="Std_PP_3", "SD of Arousal\n in Drive M", 
         ifelse(x=="PP_Dev_2_Turning", "Arousal in Drive C\nat turning segments", 
            ifelse(x=="Activity_C", "Prior stressor\n is Cognitive", x)))
})

fig_Importance <- plot_ly(importanceDf, x = ~FeatureName, y = ~Gain, type = 'bar', name = 'Gain', width=600) %>%
  add_trace(y = ~Cover, name = 'Cover') %>% 
  add_trace(y = ~Frequency, name = 'Frequency') %>% 
  layout(yaxis = yAxis, xaxis=xAxis, barmode = 'group', title="Feature Importance") %>% 
  config(.Last.value, mathjax = 'cdn')

htmltools::tagList(fig_Importance)
---
title: "R Notebook"
output: html_notebook
---

```{r}
source('../settings/settings.R')
source('commonFunctions.R')
```

```{r}
set.seed(43)
drive1 <- read.csv('../data/processed/analysis/TT1_Drive_1_PP.csv')
drive2 <- read.csv('../data/processed/Analysis/TT1_Drive_2_PP.csv')
drive3 <- read.csv('../data/processed/Analysis/TT1_Drive_3_PP.csv')
drive4 <- read.csv('../data/processed/Analysis/TT1_Drive_4_PP.csv', stringsAsFactors = T)
```

```{r}
dfSeg <- data.frame(rep(1, nrow(drive4)), rep(2, nrow(drive4)), rep(3, nrow(drive4)), rep(4, nrow(drive4)))
names(dfSeg) <- c("Seg1", "Seg2", "Seg3", "Seg4")

combinedDf_Seg1 <- cbind(drive4, 
                    drive1$MeanPP_Seg0, 
                    drive2$MeanPP_Seg1, drive3$MeanPP_Seg1, 
                    drive2$MeanPP_Seg0_1, drive3$MeanPP_Seg0_1,
                    drive2$StdPP_Seg1, drive3$StdPP_Seg1,
                    drive2$StdPP_Seg0_1, drive3$StdPP_Seg0_1,
                    dfSeg$Seg1
                  )
combinedDf_Seg2 <- cbind(drive4, 
                    drive1$MeanPP_Seg0, 
                    drive2$MeanPP_Seg2, drive3$MeanPP_Seg2, 
                    drive2$MeanPP_Seg0_2, drive3$MeanPP_Seg0_2,
                    drive2$StdPP_Seg2, drive3$StdPP_Seg2,
                    drive2$StdPP_Seg0_2, drive3$StdPP_Seg0_2,
                    dfSeg$Seg2
                  )
combinedDf_Seg3 <- cbind(drive4, 
                    drive1$MeanPP_Seg0, 
                    drive2$MeanPP_Seg3, drive3$MeanPP_Seg3, 
                    drive2$MeanPP_Seg0_3, drive3$MeanPP_Seg0_3,
                    drive2$StdPP_Seg3, drive3$StdPP_Seg3,
                    drive2$StdPP_Seg0_3, drive3$StdPP_Seg0_3,
                    dfSeg$Seg3
                  )
combinedDf_Seg4 <- cbind(drive4, 
                    drive1$MeanPP_Seg0, 
                    drive2$MeanPP_Seg4, drive3$MeanPP_Seg4, 
                    drive2$MeanPP_Seg0_4, drive3$MeanPP_Seg0_4,
                    drive2$StdPP_Seg4, drive3$StdPP_Seg4,
                    drive2$StdPP_Seg0_4, drive3$StdPP_Seg0_4,
                    dfSeg$Seg4
                  )

names(combinedDf_Seg1) <- c(names(drive4), 
                       "PP_Dev_1_Turning",
                       "PP_Dev_2_Straight", "PP_Dev_3_Straight", 
                       "PP_Dev_2_Turning", "PP_Dev_3_Turning", 
                       "Std_PP_2_Straight", "Std_PP_3_Straight", 
                       "Std_PP_2_Turning", "Std_PP_3_Turning", 
                       "Segment")
names(combinedDf_Seg2) <- c(names(drive4), 
                       "PP_Dev_1_Turning",
                       "PP_Dev_2_Straight", "PP_Dev_3_Straight", 
                       "PP_Dev_2_Turning", "PP_Dev_3_Turning", 
                       "Std_PP_2_Straight", "Std_PP_3_Straight", 
                       "Std_PP_2_Turning", "Std_PP_3_Turning", 
                       "Segment")
names(combinedDf_Seg3) <- c(names(drive4), 
                       "PP_Dev_1_Turning",
                       "PP_Dev_2_Straight", "PP_Dev_3_Straight", 
                       "PP_Dev_2_Turning", "PP_Dev_3_Turning", 
                       "Std_PP_2_Straight", "Std_PP_3_Straight", 
                       "Std_PP_2_Turning", "Std_PP_3_Turning", 
                       "Segment")
names(combinedDf_Seg4) <- c(names(drive4), 
                       "PP_Dev_1_Turning",
                       "PP_Dev_2_Straight", "PP_Dev_3_Straight", 
                       "PP_Dev_2_Turning", "PP_Dev_3_Turning", 
                       "Std_PP_2_Straight", "Std_PP_3_Straight", 
                       "Std_PP_2_Turning", "Std_PP_3_Turning", 
                       "Segment")

# combinedDf_Seg1$Subject <- paste0(as.factor(combinedDf_Seg1$Subject), ".S1")
# combinedDf_Seg2$Subject <- paste0(as.factor(combinedDf_Seg2$Subject), ".S2")
# combinedDf_Seg3$Subject <- paste0(as.factor(combinedDf_Seg3$Subject), ".S3")
# combinedDf_Seg4$Subject <- paste0(as.factor(combinedDf_Seg4$Subject), ".S4")

combinedDf <- rbind(combinedDf_Seg1, combinedDf_Seg2, combinedDf_Seg3, combinedDf_Seg4)

combinedDf$Subject <- paste0("#", str_pad(combinedDf$Subject, 2, pad="0"))
combinedDf$Segment <- as.factor(combinedDf$Segment)
```

```{r}
combinedDf_NoStressor <- combinedDf[combinedDf$Activity == "NO",]
combinedDf_Cognitive <- combinedDf[combinedDf$Activity == "C",]
combinedDf_Motoric <- combinedDf[combinedDf$Activity == "M",]

# combinedDf_NoStressor$Subject <- as.factor(combinedDf_NoStressor$Subject)
# combinedDf_Cognitive$Subject <- as.factor(combinedDf_Cognitive$Subject)
# combinedDf_Motoric$Subject <- as.factor(combinedDf_Motoric$Subject)
```

```{r}
COLOR_NORMAL <- list(color='rgb(120,120,120)')
COLOR_COGNITIVE <- list(color='rgb(158,202,225)')
COLOR_MOTORIC <- list(color='rgb(58,200,225)')
COLOR_FAILURE <- list(color='red')

THRESHOLD_MILD = 0.07
THRESHOLD_EXTREME = 0.2

MARKER_LINE_MILD = list(color="blue")
MARKER_LINE_EXTREME = list(color="red")
```

```{r}
library(nlme)
combinedDf$Subject = as.factor(combinedDf$Subject)
combinedDf$Activity = as.factor(combinedDf$Activity)
combinedDf$PP_Dev_Group = ifelse(combinedDf$PP_Dev > THRESHOLD_MILD, 1, 0)
```

```{r}
model = lm(PP_Dev ~ 
              abs(PP_Dev_2_Straight) + 
              abs(PP_Dev_3_Straight) +
              abs(PP_Dev_2_Turning) + 
              abs(PP_Dev_3_Turning) + 
              Std_PP_2_Straight + 
              Std_PP_3_Straight + 
              Std_PP_2_Turning +
              Std_PP_3_Turning +
              factor(Activity),
            data=combinedDf,
            random=~1|as.factor(Subject),
            method="REML")

# anova(model)
summary(model)
plot(model)
```

## Machine Learning

```{r}
combinedDf$PP_Dev <- NULL

combinedDf$Subject <- NULL
combinedDf$Activity_NO <- ifelse(combinedDf$Activity == "NO", 1, 0)
combinedDf$Activity_C <- ifelse(combinedDf$Activity == "C", 1, 0)
combinedDf$Activity_M <- ifelse(combinedDf$Activity == "M", 1, 0)
combinedDf$Activity <- NULL

combinedDf$PP_Dev_1_Turning <-NULL
combinedDf$PP_Prior <-NULL

combinedDf$InSegment1 <- ifelse(combinedDf$Segment == 1, 1, 0)
combinedDf$InSegment2 <- ifelse(combinedDf$Segment == 2, 1, 0)
combinedDf$InSegment3 <- ifelse(combinedDf$Segment == 3, 1, 0)
combinedDf$InSegment4 <- ifelse(combinedDf$Segment == 4, 1, 0)
combinedDf$Segment <- NULL

combinedDf$Class <- ifelse(combinedDf$PP_Dev_Group == 1, T, F)
combinedDf$PP_Dev_Group <- NULL
```

```{r}
# library(mefa)
# combinedDf <- rep(combinedDf, 10) 
```

```{r}
n_folds <- 5
params <- param <- list(objective       = "binary:logistic", 
               booster          = "gbtree",
               eval_metric      = "auc",
               eta              = 0.1,
               max_depth        = 15,
               alpha            = 1,
               lambda           = 0,
               gamma            = 0.45,
               min_child_weight = 0.3,
               subsample        = 1,
               colsample_bytree = 1)
           
# XGBoost Model         
xgb_m <- xgb.cv(   params               = param,
                  data = as.matrix(combinedDf %>% select(-Class)) ,
                  label =  combinedDf$Class,
                  nrounds             = 100,
                  verbose             = F,
                  prediction          = T,
                  maximize            = F, # Change this value to F will help to run with more itineration
                  nfold               = n_folds,
                  metrics             = c("auc", "error"),
                  early_stopping_rounds = 50,
                  stratified            = T,
                  scale_pos_weight      = 1/2)

# xgb_m$evaluation_log[xgb_m$best_iteration,"test_auc_mean"]
xgb_m$evaluation_log[xgb_m$best_iteration,]

```

```{r}
# Prediction
combinedDf$clsPred <- round(xgb_m$pred)

computePerformanceResults <- function(sdat){
  sdat = sdat[complete.cases(sdat),]
  acc = sum(sdat[,1] == sdat[,2])/nrow(sdat)
  conf_mat = table(sdat)
  specif = conf_mat[1,1]/sum(conf_mat[,1])
  sensiv = conf_mat[2,2]/sum(conf_mat[,2])
  preci =  conf_mat[2,2]/sum(conf_mat[2,])
  npv =    conf_mat[1,1]/sum(conf_mat[1,])
  return(c(acc,specif,sensiv,preci,npv))
}

# Get average performance
performance <- computePerformanceResults(combinedDf %>% select(Class, clsPred))
acc <- performance[1]
prec <- performance[4]
recall <- performance[3]
spec <- performance[2]
npv <- performance[5]
f1 <- (2 * recall * prec) / (recall + prec)
auc <- as.numeric(xgb_m$evaluation_log[xgb_m$best_iteration, "test_auc_mean"])

print(paste("Accuracy=", round(acc, 2)))
print(paste("Precision=", round(prec, 2)))
print(paste("Recall=", round(recall, 2)))
print(paste("Specificity=", round(spec, 2)))
print(paste("NPV=", round(npv, 2)))
print(paste("F1=", round(f1, 2)))
print(paste("AUC=", round(auc, 2)))
```

```{r}
# Importance
combinedDf$clsPred <- NULL
bst <- xgboost(   params               = param,
                  data = as.matrix(combinedDf %>% select(-Class)) ,
                  label =  combinedDf$Class,
                  nrounds             = 100,
                  verbose             = F,
                  prediction          = T,
                  maximize            = F, # Change this value to F will help to run with more itineration
                  nfold               = n_folds,
                  metrics             = c("auc", "error"),
                  early_stopping_rounds = 50,
                  stratified            = T,
                  scale_pos_weight      = 1/2)
importanceDf <- xgb.importance(colnames(combinedDf %>% select(-Class) ), model = bst)
print(importanceDf)
```

```{r}
library(pROC)

dfROC <- pROC::roc(response = ifelse(combinedDf$Class==T, 1, 0),
               predictor = round(xgb_m$pred),
               levels=c(0, 1), direction = "<")

# it = which.max(xgb_m$evaluation_log$test_auc_mean)
# best.iter = xgb_m$evaluation_log$iter[it]
# best.iter 

plot(pROC::roc(response = ifelse(combinedDf$Class==T, 1, 0),
               predictor = round(xgb_m$pred),
               levels=c(0, 1), direction = "<"), 
     legacy.axes = TRUE,
     main="ROC Curve", 
     lwd=1.5) 
```

# Important features
```{r}
# Eleminate #5 who has an exceptional data to find a better threshold
stdPP3 <- sort(importantFeaturesDf$Std_PP_3, decreasing = T)[2:length(importantFeaturesDf$Std_PP_3)]
stdPP3Array <- matrix(stdPP3 ,nrow = 1,ncol = length(stdPP3))
  
maxStdPP3 <- sort(importantFeaturesDf$Std_PP_3, decreasing = T)[2]
PP_DEV_3_THRESHOLD <- otsu(stdPP3Array, range=c(min(stdPP3), maxStdPP3)) # Expected Threshold = 0.088
print(paste0('Threshold: ', PP_DEV_3_THRESHOLD))

importantFeaturesDf$PP_Dev_3_Group <- ifelse(importantFeaturesDf$Std_PP_3 > PP_DEV_3_THRESHOLD, 1, 0)
write.csv(importantFeaturesDf, "../outputs/importantFeatures.csv")
```


### Plot feature importance
```{r}
yAxis <- list(
  title = 'Importance',
  range=c(0.0, 1.0)
)
xAxis <- list(
  title = ''
)
importanceDf$FeatureName <- lapply(importanceDf$Feature, function(x) {
  ifelse(x=="Std_PP_3", "SD of Arousal\n in Drive M", 
         ifelse(x=="PP_Dev_2_Turning", "Arousal in Drive C\nat turning segments", 
            ifelse(x=="Activity_C", "Prior stressor\n is Cognitive", x)))
})

fig_Importance <- plot_ly(importanceDf, x = ~FeatureName, y = ~Gain, type = 'bar', name = 'Gain', width=600) %>%
  add_trace(y = ~Cover, name = 'Cover') %>% 
  add_trace(y = ~Frequency, name = 'Frequency') %>% 
  layout(yaxis = yAxis, xaxis=xAxis, barmode = 'group', title="Feature Importance") %>% 
  config(.Last.value, mathjax = 'cdn')

htmltools::tagList(fig_Importance)
```




